\documentclass[graybox]{svmult}

\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{tikz}
\usepackage{soul}  % highlight text using \hl
\usepackage{verbatim}  % multiline comments
\usepackage{subfig}

\begin{document}

\title{Exploring the Universality of the BHP distribution for the Paiva river}
\author{Ricardo Cruz \and Renato Fernandes \and Alberto Pinto}
\institute{
Ricardo Cruz \email{ricardo.pdm.cruz@gmail.com}
\and Renato Fernandes \email{renatodsfernandes@gmail.com}
\and Alberto Pinto, University of Porto \email{aapinto@fc.up.pt}
}

\maketitle

\abstract{
We expand on work by Gon\c{c}alves et al \cite{Goncalves:11} who has developed data transformations to describe the BHP distribution \cite{article:bhp} as a model for the daily flows of the Paiva river (northern Portugal). The BHP distribution has been found to be useful in modeling various natural and human phenomena. In this paper, we complement and compare Gon\c{c}alves et al's BHP fitting to a standard Normal distribution, and we also decompose the flow in patterns and apply a selective fitting for those patterns. We end up with a discussion of how the data transformation parameters may be used to find distribution changes in the river flow, namely due to global warming.
}

\section{Introduction}

Picking up from Gon\c{c}alves et al's \cite{Goncalves:11} latter article on the fitting of the BHP distribution to the Standard's \& Poor index data set, we explore and expand the techniques used in the article. In that article, the authors find the BHP distribution on the daily price flows by finding the optimal $\alpha \in {[0,1]}$ to power the relative daily flows data points that best fits the BHP distribution.

The BHP is a non-parametrized distribution that has been found to be useful in modeling various natural and human phenomena. Examples of such phenomena include width power in steady state systems \cite{article:bhp}, Plasma density and electrostatic turbulent fluxes \cite{artigo3}, Wolf's sunspot numbers \cite{artigo4}, and also in the height and flow of several  rivers (see \cite{artigo5}, \cite{artigo6}, and \cite{artigo7}). We start by applying the same method for the Normal distribution ($\mathcal{N}(0,1)$). We expand on the study by testing the fitting for the states of various embeddings of the flows, by studying the transition probabilities between a state to the others and then summarizing it using Markov Chains. The data we study is from the Paiva river (Portugal) \cite{Goncalves:09}.

The Paiva River is a natural river from Portugal on the northern part of the territory untouched by man, i. e., there are no dams or dikes that influence the river flow, so all its movement is as meant by nature. This kind of flow is supposed to be unpredictable because it is all dependant on the natural effects like rain, heat and others. Another interesting fact about the Paiva River is that its data distribution for the runoff values is very similar to the ones of the stock market prices and by developing tools to study this river, we can also study and compare the data from stock markets.

\section{Data}

The data of the Paiva river comes from source \cite{ref:paiva}, and its time series is presented in figure \ref{fig:paiva-1}.

The data consists in the flows, from the dates of October's 1st of 1946 till September's 13th of 2006. The dataset has no missing values, totalizing $21,913$ records.

\begin{figure}
\centering


\subfloat[Time series of the relative flows of the Paiva river (blue: increases, red: decreases). On the bottom, a close-up of the decreasing flows.\label{fig:paiva-1}]{%
        \includegraphics[width=0.50\textwidth]{paiva-data.pdf}
    }
    \hfill
    \subfloat[Map of Portugal with the location of the Paiva river, inside the rectangle.\label{fig:paiva-2}]{%
        \includegraphics[width=0.45\textwidth]{paiva-map.png}
    }
\caption{Time series of the river relative flows (percent-wise), and respective geolocation.}
\label{fig:paiva}
\end{figure}

All data processing and figures were created using R \cite{ref:R}.

\section{Flow patterns and Markov chains}

BHP is a non-parametric distribution, therefor our parametrization will regard data transformation only. We will evaluate the possibility that different flow patterns are better fit with different parameters. In the particular case of the Paiva river, it does not have dams and its level is generally increased by rain (which is of seldom occorrence but of drastic magnitude increases) and decreases are generally due to evaporation and outflow, and are slow and happen most of the year.

Gon\c{c}alves et al \cite{Goncalves:09} focused their study mainly on the days when the flow decreases. Here we study the impact of the preceding days of rain in the characteristics of the outflow. To do this detailed study, we use the flow patterns of increases and decreases of outflows to build Markov chains that explicitly exhibit the impact of the preceding days of rain.

Let us start by adapting the presentation of the river heights in terms of relative flows. The initial data has the river height $X_t$ of each day, so we compute its relative differences,

$$r_t = \frac {X_{t} - X_{t-1}}{X_{t}}$$

\noindent
Let $\textbf{X}$ be our initial data and let $m \in \mathbb{N}$ be the window of days that we will want to focus on. We set $\textbf{A}_{m}(t)=(X_{t-m}, ..., X_{t-1})$ and $\textbf{B}_{m}(t)=(X_{t-m+1}, ..., X_{t})$ dependent on the time $t$ and we define the difference $\textbf{R}_{m}(t)$ as 

$$\textbf{R}_{m}(t)=\textbf{B}_{m}(t)-\textbf{A}_{m}(t).$$

\noindent
The \emph{flow window} $\mathbf{w}_m=(w_1,\ldots,w_m)$ is a vector where $w_i \in \{-1,1\}$. The \emph{window memory} is $m-1$.
Let  $\textbf{T}(\mathbf{w}_m)$ be the set of all vectors $\textbf{R}_{m}(t)=(R_{t-m+1}, ..., R_{t})$ with the property that 

$$\frac{R_{t-m+i}}{|R_{t-m+i}|}=w_i.$$

\noindent
The value $w_i=-1$ refers to a decreasing flow and the value $w_i=1$ to an increasing flow. In the figures, we shall codify $w_i=-1$ by $-$ and $w_i=1$ by $+$. For example $(++-)$ refers to a succession of three days with a flow increase followed by another increase and then a decrease.

\begin{figure}
\centering
\begin{tikzpicture}
\draw (0-.5,.3) node {\dots};
\draw (0,0) rectangle (1,.6); \draw (.5,.3) node {1.72};
\draw (1,0) rectangle (2,.6); \draw (1+.5,.3) node {-0.20};
\draw (2,0) rectangle (3,.6); \draw (2+.5,.3) node {-0.35};
\draw (3,0) rectangle (4,.6); \draw (3+.5,.3) node {1.66};
\draw (4+.5,.3) node {\dots};
\draw[|-|] (0,-.3) -- (3,-.3); \draw (3/2,-.3) node[below] {$m=3, w=(+--)$};
\draw[|-|] (1,-1) -- (4,-1); \draw (1+3/2,-1) node[below] {$m=3, w=(--+)$};
\end{tikzpicture}
\caption{Illustration of different time windows in the data.}
\end{figure}

In the figure \ref{fig:markov}, we represent data flows in the form of a tree, where the lines show the probabilities of the next $t$ flow being either $-$ or $+$.

We run Fisher's Exact Test for Count Data in order to test how much we need to look back (ie. the memory we need) in order to predict tomorrow's positive or negative flow. We find that when there is a river increase, we can say with a confidence of $0.90$ that the probability of tomorrow's flow increasing again or decreasing is always 50-50. When the river flow decreases, we need to look back up to 8 days, in order to predict tomorrow's flow.

\begin{figure}
\centering
\input{markov.tex}
\caption{Representation of the river flow rates (up to memory 4)}
\label{fig:markov}
\end{figure}

\section{Flow patterns versus BHP and Normal fitting}

Gon\c{c}alves \cite{Goncalves:09} models the fluctuations of the Paiva river by applying a non-linear transformations to the absolute value of the relative flows $|r_t|^{\alpha}$ and then normalize those values:

\begin{equation}
\textbf{X}_{\alpha,\mathbf{w}_m}=\left \{ \frac{|r_t|^{\alpha}-\mu_\alpha}{\sigma_\alpha}:t \in \textbf{T}(\mathbf{w}_m)\right \}
\label{eq:alpha-transform}
\end{equation}

where $\mu_\alpha$, $\sigma_\alpha$ are the mean and standard deviation of $|r_t|^{\alpha}$.

Instead of the previous transformation, we use the more widely popular power transform \cite{power-transform}, based on the Cox-Box transform with the extra of being divided by the geometric mean (GM) in order to cancel out the units:

$$\textbf{X}_{\lambda,\mathbf{w}_m}=\left \{ \frac{Y_{\lambda,\mathbf{w}_m}(t)-\mu_\lambda}{\sigma_\lambda}:t \in \textbf{T}(\mathbf{w}_m)\right \}
$$

where

$$Y_{\lambda,\mathbf{w}_m}(t)=\frac{|r_t|^{\lambda}-1}{\lambda \text{GM}(Y_{\lambda,\mathbf{w}_m})}$$

and $\mu_\lambda$, $\sigma_\lambda$ are the mean and standard deviation of $Y_{\lambda,\mathbf{w}_m}$

\noindent
Interesting properties are that when $\lambda=1$ the only transformation is the normalization of the data ($\mu=1$ and $\sigma=1$), and as $\lambda \to 0$, the transformation is a log-transformation of the data.

Of curious note is that the results from the power transform do not differ by much from Gon\c{c}alves' transformation (\ref{eq:alpha-transform}). The models will be fittings of these fluctuations to the BHP and the standard Normal distributions.

\begin{figure}
\centering
\includegraphics[width=\linewidth]{paiva-lambdas-hist-0.pdf}
\caption{Fitting of several lambdas for the Paiva river for $\mathbf{w}_1 = (-)$ (decreasing flows of memory 1). Top: the resulting transformation on the time series. Bottom: the histogram (in red: $\mathcal{N}(0,1)$, and blue: BHP). Note: $\lambda = 1$ implies only an affine transformation of the data.}
\end{figure}

The $\lambda$ we will use is not just any number, it is the $\lambda^{*}$ that maximizes a certain goodness of fit test for the BHP distribution and for the Normal distributions. It might be interesting to use more theoretical distributions, but that would require the addition of extra parameters.

As we want to apply goodness of fit tests to samples of different flow patterns and compare those values, we avoided the Kolmogorov-Smirnov test since it is sensitive to sample size. Therefor we use only the Kolmogorov-Smirnov statistic, and we will use the maximum distance between the theoretical and empirical distributions: $\sup_x | X_{\lambda}(x) - D(x)|$, where $D$ represent the theoretical distribution we are testing against (either the BHP or the Normal).

\section{Studying flow memories}

\begin{figure}
\centering
\includegraphics[width=\linewidth]{paiva-lambdas-norm.pdf}
\caption{The Kolmogorov-Smirnov statistic for the Normal distribution when using different lambdas. $\lambda^*$ (for which the statistic is minimize) is represented as a red dot.}
\end{figure}

\begin{figure}
\centering
\includegraphics[width=\linewidth]{paiva-lambdas-bhp.pdf}
\caption{The Kolmogorov-Smirnov statistic for the BHP distribution when using different lambdas. $\lambda^*$ (for which the statistic is minimize) is represented as a red dot.}
\end{figure}


We will now plot in figure \ref{fig:paiva-mem-fit} the $\lambda^*$ fitting parameter as we look back into ever larger segments of always decreasing or always increasing flows. We minimize theoretical and empirical data distances in the first figure, using the Kolmogorov-Smirnov statistic. For each data flow: the figure shows lambda evolving as we increase the memory of the flow pattern; $\mathbf{w}_1=(-)$, $\mathbf{w}_2=(--)$, $\mathbf{w}_3=(---)$ and so on. Each color represents a different distribution.

Interestingly, a pattern seems to ensue for river decreases. In the case of the Paiva river, as we restrict our data into longer periods of flow decreases, alpha starts tending to 1 for the BHP distribution, meaning no non-linear manipulation is required for a good fit. $\lambda^*$ for positive river flow is not monotonic, but is surprisingly well behaved as well.

\begin{figure}
\centering
\includegraphics[width=\linewidth]{paiva-mem-fit.pdf}
\caption{Comparing the alpha parameter evolution (and the result of our goodness of fit test: the lower the better) as we increase the memory for Paiva river. Left: decreasing flows. Right: increasing flows.}
\label{fig:paiva-mem-fit}
\end{figure}

The sample size of our data gets smaller and smaller as we increase the memory window, especially for increases since there are less occurrences of those, see figure \ref{fig:paiva-1}; still in the worst case scenario, we still have $273$ data points for increases when memory is $12$.

\section{Conclusions}

From figure \ref{fig:paiva-mem-fit}, the BHP seems a better fit (according to the Kolmogorov-Smirnov statistic) when predicting the next flow, when using lower memories of the river flows. As we increase memory, $\lambda^*$ for the BHP tends to $1$, meaning no transformation is required to model river flows using the BHP as we use more information from the river past. It was was also seen through the Fisher's Exact Test that for decreasing flows, more memory was needed for better predictions.

By modeling the river flows using the BHP, while adjusting the lambda parameter for the data transformation, it would be interesting to identify changes in the river distribution by detecting changes in the lambda parameter. For instance, we have noticed that $\lambda^*$ has been changing as we cross section the data in time, see figures \ref{fig:paiva-lambdas-time-mem1} and \ref{fig:paiva-lambdas-time-mem6}. This may be a result of factors such as global warming.

\begin{figure}
\centering
\includegraphics[width=\linewidth]{paiva-lambdas-time-0.pdf}
\caption{We fit $\lambda^*$ across time sections (for the BHP) for decreases of memory 1 $(-)$. For each year, we estimate $\lambda^*$ by looking at the distribution of flows within the preceding 10 years up to that year. The red horizontal line is $\lambda^*$ across all years.}
\label{fig:paiva-lambdas-time-mem1}
\end{figure}

\begin{figure}
\centering
\includegraphics[width=\linewidth]{paiva-lambdas-time-000000.pdf}
\caption{Same $\lambda^*$ BHP-fit as in figure \ref{fig:paiva-lambdas-time-mem1} but for decreases of memory 6 $(------)$. For each year, we estimate $\lambda^*$ by looking at the distribution of flows within the preceding 10 years up to that year. The red horizontal line is $\lambda^*$ across all years.}
\label{fig:paiva-lambdas-time-mem6}
\end{figure}



\begin{thebibliography}{99}

\bibitem{Goncalves:11}
  R. Gon\c{c}alves, H. Ferreira \& A. A. Pinto,
  \emph{Universality in the stock exchange
market}.
  Journal of Difference Equations and
Applications,
  2011.

\bibitem{Goncalves:09}
  R. Gon\c{c}alves, H. Ferreira, A. A. Pinto \& N. Stollenwerk,
  \emph{Universality in nonlinear prediction of
complex systems}.
  Journal of Difference Equations and
Applications,
  2009.

\bibitem{article:bhp}
  S. T. Bramwell, P. C. W. Holdsworth 2 \& J. F. Pinton,
  \emph{Universality of rare fluctuations in turbulence and critical phenomena}.
  Nature, Volume 396,
  10 December 1998.

\bibitem{artigo2}
  S. T. Bramwell, J. Y. Fortin, P. C. W. Holdsworth, S. Peysson, J. F. Pinton, B. Portelli, and M. Sellitto,
  \emph{Magnetic fluctuations in the classical XY model: The origin of an exponential tail in a complex system}.
  Physical Review, Volume 63,
  2001.

\bibitem{artigo3}
	B.Ph. van Milligen, R. S\'{a}nchez, B.A. Carreras, V.E. Lynch, B. LaBombard, M.A. Pedrosa, C. Hidalgo, B. Gon{c}calves, R. Balb\'{i}n, and the W7-A5 team,
    \emph{Additional evidence for the universality of turbulent fluctuations and fluxes in the scrape-off layer of tokamaks and
stellarators}.
	Phys. Plasmas 12, 2005.

\bibitem{artigo4}
	R. Gon{c}calves, A. Pinto, and N. Stollenwerk,
    \emph{Cycles and universality in sunspot numbers fluctuations}.
	Astrophys. J. 691, pp. 1583-1586, 2009.

\bibitem{artigo5}
	S.T. Bramwell, K. Christensen, J.Y. Fortin, P.C.W. Holdsworth, H.J. Jensen, S. Lise, J.M. L\'{o}pez, M. Nicodemi, and M. Sellitto,
    \emph{Universal fluctuations in correlated systems}.
	Phys. Rev. Lett. 84, pp. 3744-3747, 2001.

\bibitem{artigo6}
	S.T. Bramwell, T. Fennell, P.C.W. Holdsworth, and B. Portelli,
    \emph{Universal fluctuations of the danube water level: A link with turbulence, criticality and company growth}.
	Europhys. Lett. 57, p. 310, 2002.

\bibitem{artigo7}
	K. Dahlstedt and H.J. Jensen,
    \emph{Fluctuation spectrum and size scaling of river flow and level}.
	Phys. A 348, pp. 596-610, 2005.

\bibitem{power-transform}
    Box, George E. P. and Cox, D. R.,
    \emph{An analysis of transformations}.
    Journal of the Royal Statistical Society, Series B 26 (2): 211–252. JSTOR 2984418. MR 192611, 1964.

\bibitem{ref:paiva}
  Paiva river data from
  \emph{Sistema Nacional de Informa\c{c}\~{a}o de Recursos H\'{i}dricos}.
  \url{www.snirh.pt}.

\bibitem{ref:R}
  R Core Team,
  \emph{R: A Language and Environment for Statistical Computing}.
  R Foundation for Statistical Computing,
  2014.

\end{thebibliography}

\end{document}

